資料分析2

林嶔 (Lin, Chin)

Lesson 6

第一節:卡方檢定(1)

– 請至這裡下載本週的範例資料

dat = read.csv("Example data.csv", header = TRUE)
head(dat)
##       eGFR Disease Survival.time Death Diabetes Cancer      SBP      DBP
## 1 34.65379       1     0.4771037     0        0      1 121.2353 121.3079
## 2 37.21183       1     3.0704424     0        1      1 122.2000 122.6283
## 3 32.60074       1     0.2607117     1        0      0 118.9136 121.7621
## 4 29.68481       1            NA    NA        0      0 118.2212 112.7043
## 5 28.35726       0     0.1681673     1        0      0 116.7469 115.7705
## 6 33.95012       1     1.2238556     0        0      0 119.9936 116.3872
##   Education Income
## 1         2      0
## 2         2      0
## 3         0      0
## 4         1      0
## 5         0      0
## 6         1      0

– 這時候我們需要使用函數「chisq.test()」及「fisher.test()」

TABLE1 = table(dat[,"Diabetes"], dat[,"Education"])
TABLE1
##    
##       0   1   2
##   0 403 231 148
##   1  89  54  37
#也可以這樣下指令,會得到完全一樣的結果
TABLE2 = table(dat[,c("Diabetes", "Education")])
TABLE2
##         Education
## Diabetes   0   1   2
##        0 403 231 148
##        1  89  54  37
Result1 = chisq.test(TABLE1)
Result1
## 
##  Pearson's Chi-squared test
## 
## data:  TABLE1
## X-squared = 0.33753, df = 2, p-value = 0.8447
Result2 = fisher.test(TABLE2)
Result2
## 
##  Fisher's Exact Test for Count Data
## 
## data:  TABLE2
## p-value = 0.8318
## alternative hypothesis: two.sided

第一節:卡方檢定(2)

ls(Result1)
## [1] "data.name" "expected"  "method"    "observed"  "parameter" "p.value"  
## [7] "residuals" "statistic" "stdres"
ls(Result2)
## [1] "alternative" "data.name"   "method"      "p.value"

第一節:卡方檢定(3)

– 把它寫成程式碼的話,計算期望值表的方法如下

TABLE1 = table(dat[,"Diabetes"], dat[,"Education"])
TABLE1
##    
##       0   1   2
##   0 403 231 148
##   1  89  54  37
marginal.n1 = rep(NA, nrow(TABLE1))
for (i in 1:nrow(TABLE1)) {
  marginal.n1[i] = sum(TABLE1[i,])
}
marginal.n1
## [1] 782 180
marginal.n2 = rep(NA, ncol(TABLE1))
for (i in 1:ncol(TABLE1)) {
  marginal.n2[i] = sum(TABLE1[,i])
}
marginal.n2
## [1] 492 285 185
N = sum(TABLE1)
N
## [1] 962
expected.TABLE = marginal.n1%*%t(marginal.n2)/N
expected.TABLE
##           [,1]     [,2]      [,3]
## [1,] 399.94179 231.6736 150.38462
## [2,]  92.05821  53.3264  34.61538
RESULT = expected.TABLE > 5
RESULT 
##      [,1] [,2] [,3]
## [1,] TRUE TRUE TRUE
## [2,] TRUE TRUE TRUE
sum(RESULT)/length(RESULT)
## [1] 1

練習1:類別變項分析流程

  1. 使用這可以任意輸入兩個等長變項做檢定(若不等長則警告並停止執行函數)

  2. 其中一個變項必須為類別變項,第二個變項隨使用者指定

  3. 若第二個變項為連續變項,則先考慮第一個變項的類別數,若類別數為2則走t test及Wilcoxon test路線;若類別數大於2則走ANOVA及Kruskal-Wallis test路線

  4. 若第二個變項為類別變項,則使用卡方檢定或Fisher exact test

– 註:請在函數內幫助使用者判斷該使用哪一種檢定法,並直接將報表寫出

練習1答案(1)

my_func.analysis = function (x, y) {
  
  lvl.x = levels(factor(x))
  n.lvl.x = length(lvl.x)
  
  lvl.y = levels(factor(y))
  n.lvl.y = length(lvl.y)
  
  if (n.lvl.y < 10) {
    
    TABLE1 = table(x, y)
    marginal.n1 = rep(NA, nrow(TABLE1))
    for (i in 1:nrow(TABLE1)) {
      marginal.n1[i] = sum(TABLE1[i,])
    }
    marginal.n2 = rep(NA, ncol(TABLE1))
    for (i in 1:ncol(TABLE1)) {
      marginal.n2[i] = sum(TABLE1[,i])
    }
    N = sum(TABLE1)
    expected.TABLE = marginal.n1%*%t(marginal.n2)/N
    expected.TABLE
    
    RESULT = expected.TABLE > 5
    
    if (sum(RESULT)/length(RESULT) > 0.8) {
      
      result = chisq.test(TABLE1)
      
    } else {
      
      result = fisher.test(TABLE1)
      
    }
    
  } else {
    
    n.each = numeric(n.lvl.x)
    
    for (i in 1:n.lvl.x) {
      
      current_vec = y[x == lvl.x[i]]
      n.each[i] = sum(table(current_vec))
      
    }
    
    
    if (n.lvl.x > 2) {
      
      if (FALSE %in% (n.each > 25)) {
        
        result = kruskal.test(y~x)
        
      } else {
        
        Variance.table = aov(y~as.factor(x))
        result = anova(Variance.table)
        
      }
      
    } else {
      
      if (FALSE %in% (n.each > 25)) {
        
        result = wilcox.test(y~x)
        
      } else {
        
        var.result = var.test(y~x)
        
        if (var.result[['p.value']] < 0.05) {
          
          result = t.test(y~x, var.equal = FALSE)
          
        } else {
          
          result = t.test(y~x, var.equal = TRUE)
          
        }
        
      }
      
    }
    
  }
  
  result
  
}

練習1答案(2)

my_func.analysis(dat$Income, dat$Education)
## 
##  Pearson's Chi-squared test
## 
## data:  TABLE1
## X-squared = 8.5353, df = 4, p-value = 0.07383
my_func.analysis(dat[dat$Income == 2 & dat$Diabetes == 1,]$Education, dat[dat$Income == 2 & dat$Diabetes == 1,]$Death)
## 
##  Fisher's Exact Test for Count Data
## 
## data:  TABLE1
## p-value = 0.3824
## alternative hypothesis: two.sided
my_func.analysis(dat$Disease, dat$eGFR)
## 
##  Welch Two Sample t-test
## 
## data:  y by x
## t = -2.4556, df = 466.06, p-value = 0.01443
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -2.2043011 -0.2445876
## sample estimates:
## mean in group 0 mean in group 1 
##        33.19008        34.41452
my_func.analysis(dat[dat$Income == 2,]$Disease, dat[dat$Income == 2,]$eGFR)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  y by x
## W = 691, p-value = 0.2357
## alternative hypothesis: true location shift is not equal to 0
my_func.analysis(dat$Income, dat$eGFR)
## Analysis of Variance Table
## 
## Response: y
##               Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(x)   2    102  51.016  1.0115 0.3641
## Residuals    944  47613  50.438
my_func.analysis(dat[dat$Income == 2,]$Education, dat[dat$Income == 2,]$eGFR)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  y by x
## Kruskal-Wallis chi-squared = 8.7301, df = 2, p-value = 0.01271

期中報告(歡迎組隊繳交報告)

##   Variable Diabetes:0  Diabetes:1  p-value 
## 1 "eGFR"   "32.9±6.6" "39.8±6.5" "<0.001"
##   Variable      Diabetes:0   Diabetes:1  p-value
## 1 "Education"   ""           ""          "0.845"
## 2 "Education:0" "403(51.5%)" "89(49.4%)" ""     
## 3 "Education:1" "231(29.5%)" "54(30.0%)" ""     
## 4 "Education:2" "148(18.9%)" "37(20.6%)" ""
##    Variable      Diabetes:0   Diabetes:1   p-value 
## 1  "Education"   ""           ""           "0.845" 
## 2  "Education:0" "403(51.5%)" "89(49.4%)"  ""      
## 3  "Education:1" "231(29.5%)" "54(30.0%)"  ""      
## 4  "Education:2" "148(18.9%)" "37(20.6%)"  ""      
## 5  "eGFR"        "32.9±6.6"  "39.8±6.5"  "<0.001"
## 6  "Income"      ""           ""           "0.673" 
## 7  "Income:0"    "618(79.2%)" "141(77.5%)" ""      
## 8  "Income:1"    "77(9.9%)"   "22(12.1%)"  ""      
## 9  "Income:2"    "85(10.9%)"  "19(10.4%)"  ""      
## 10 "SBP"         "119.7±9.9" "130.3±9.8" "<0.001"
## 11 "DBP"         "120.0±9.6" "128.3±9.3" "<0.001"

第二節:相關性-1(1)

– 在不使用迴歸相關的技術前提下,我們只剩下相關系數能夠使用了

– 函數「cor.test()」可以用來計算相關係數

Result3 = cor.test(dat[,"DBP"],dat[,"SBP"], method = "pearson")
Result3
## 
##  Pearson's product-moment correlation
## 
## data:  dat[, "DBP"] and dat[, "SBP"]
## t = 53.178, df = 952, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.8480377 0.8801056
## sample estimates:
##       cor 
## 0.8649519
Result4 = cor.test(dat[,"DBP"],dat[,"SBP"], method = "spearman")
Result4
## 
##  Spearman's rank correlation rho
## 
## data:  dat[, "DBP"] and dat[, "SBP"]
## S = 21264000, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.8530528

練習2:相關性分析實做

  1. 使用這可以任意輸入兩個等長連續變項做檢定(若不等長則警告並停止執行函數)

  2. 判斷Sample size是否小於25,如果大於則使用Pearson correlation,小於則使用Spearman correlation

練習2答案

my_func.correlation = function (x, y) {
  
  if (length(x) != length(y)) {
    
    cat("x與y不等長")
    
  } else {
    
    n.sample = sum(!is.na(x) & !is.na(y))
    
    if (n.sample >= 25) {
      
      result = cor.test(x, y, method = "pearson")

    } else {
      
      result = cor.test(x, y, method = "spearman")

    }
    
    result
    
  }
  
}
my_func.correlation(dat[,"DBP"],dat[,"SBP"])
## 
##  Pearson's product-moment correlation
## 
## data:  x and y
## t = 53.178, df = 952, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.8480377 0.8801056
## sample estimates:
##       cor 
## 0.8649519
my_func.correlation(dat[dat$Income == 2 & dat$Diabetes == 1,"DBP"], dat[dat$Income == 2 & dat$Diabetes == 1,"SBP"])
## 
##  Spearman's rank correlation rho
## 
## data:  x and y
## S = 214, p-value = 0.0002096
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.7791538

第三節:相關性-2(1)

– 非連續變項的相關係數最常使用的是『Polychoric correlation』

F6_1

第三節:相關性-2(2)

– 我們可以透過下列方法安裝這個套件

F6_2

– 或是使用函數『install.packages()』安裝指定套件

install.packages("polycor")
library("polycor")

第三節:相關性-2(3)

第三節:相關性-2(4)

– 如果你看不懂每一行的指令在寫甚麼,請一行一行看,並了解該函數的input的格式是什麼。

if(require(mvtnorm)){
set.seed(12345)
data <- rmvnorm(1000, c(0, 0), matrix(c(1, .5, .5, 1), 2, 2))
x <- data[,1]
y <- data[,2]
cor(x, y) # sample correlation
}
## [1] 0.5263698
if(require(mvtnorm)){
x <- cut(x, c(-Inf, .75, Inf))
y <- cut(y, c(-Inf, -1, .5, 1.5, Inf))
polychor(x, y) # 2-step estimate
}
## [1] 0.5230474
if(require(mvtnorm)){
set.seed(12345)
polychor(x, y, ML=TRUE, std.err=TRUE) # ML estimate
}
## 
## Polychoric Correlation, ML est. = 0.5231 (0.03819)
## Test of bivariate normality: Chisquare = 2.739, df = 2, p = 0.2543
## 
##   Row Threshold
##   Threshold Std.Err.
##      0.7537  0.04403
## 
## 
##   Column Thresholds
##   Threshold Std.Err.
## 1   -0.9842  0.04746
## 2    0.4841  0.04127
## 3    1.5010  0.06118

第三節:相關性-2(5)

F6_3

– 但是可能因為臨床意義等原因(如空腹血糖126以上被定義為糖尿病),使這兩個變項被置換成類別/序位變項。

– 因此Polychoric correlation的想法是,想辦法將兩個類別變項還原成連續變項,再進行相關係數檢定。

– 所以在範例的程式碼,第一個部分是真的製造兩個連續變項,並做Pearson correlation;第二個部分則是強硬的把他切割成數份,並計算Polychoric correlation看答案是不是類似

if(require(mvtnorm)){
set.seed(12345)
data <- rmvnorm(1000, c(0, 0), matrix(c(1, .5, .5, 1), 2, 2))
x <- data[,1]
y <- data[,2]
cor(x, y) # sample correlation
}
## [1] 0.5263698
if(require(mvtnorm)){
x <- cut(x, c(-Inf, .75, Inf))
y <- cut(y, c(-Inf, -1, .5, 1.5, Inf))
polychor(x, y) # 2-step estimate
}
## [1] 0.5230474

第三節:相關性-2(6)

Result5 = polychor(dat[,"Disease"], dat[,"Diabetes"])
Result5
## [1] 0.1818627
Result6 = polyserial(dat[,"eGFR"], dat[,"Disease"])
Result6
## [1] 0.1042781
Result7 = polychor(dat[,"Disease"], dat[,"Diabetes"], std.err=TRUE)
Result7
## 
## Polychoric Correlation, 2-step est. = 0.1819 (0.06496)
Result8 = polyserial(dat[,"eGFR"], dat[,"Disease"], std.err=TRUE)
Result8
## 
## Polyserial Correlation, 2-step est. = 0.1063 (0.04478)
## Test of bivariate normality: Chisquare = 8.175, df = 5, p = 0.1468

– 在R裡面可以簡單的使用函數「pchisq()」求卡方分布的累積分布值,所以我們可以利用這個方式來算p value

Z = Result7$rho/sqrt(Result7$var)
CHISQ = Z^2
pchisq(CHISQ, df = 1, lower.tail = FALSE)
##             [,1]
## [1,] 0.005112662
pchisq(Result8$chisq, Result8$df, lower.tail = FALSE)
## [1] 0.1468378

練習3:相關係數分析的擴充功能

  1. 使用這可以任意輸入兩個等長變項做檢定(若不等長則警告並停止執行函數)

  2. 判斷使用者輸入的變項是類別(factor)或是連續,並且使用適當的方法檢定

  3. 如果是兩個都是連續變項,先判斷Sample size是否小於25,如果大於則使用Pearson correlation,小於則使用Spearman correlation

  4. 只要匯出三個資訊:(1)相關係數、(2)p value、(3)用什麼方法

練習4:進階挑戰

cor(dat[,c("eGFR", "SBP", "DBP")], use = "pairwise.complete.obs")
##           eGFR       SBP       DBP
## eGFR 1.0000000 0.9683834 0.9192064
## SBP  0.9683834 1.0000000 0.8649519
## DBP  0.9192064 0.8649519 1.0000000

– 如果厲害一點的話,請在相關矩陣的上半部填入p value,而下半部則填入相關係數。

練習3答案(1)

library("polycor")

my_func.correlation = function (x, y) {
  
  if (length(x) != length(y)) {
    
    cat("x與y不等長")
    
  } else {
    
    
    lvl.x = levels(factor(x))
    n.lvl.x = length(lvl.x)
    
    lvl.y = levels(factor(y))
    n.lvl.y = length(lvl.y)
    
    if (n.lvl.x > 10 & n.lvl.y > 10) {
      
      n.sample = sum(!is.na(x) & !is.na(y))
      
      if (n.sample >= 25) {
        
        result = cor.test(x, y, method = "pearson")
        method = "pearson"
        
      } else {
        
        result = cor.test(x, y, method = "spearman")
        method = "spearman"
        
      }
      
      rho = result$estimate
      p_val = result$p.value
      
    } else {
      
      if (n.lvl.x < 10) {
        
        if (n.lvl.y < 10) {
          
          result = polychor(x, y, std.err=TRUE)
          Z = result$rho/sqrt(result$var)
          CHISQ = Z^2
          p_val = pchisq(CHISQ, df = 1, lower.tail = FALSE)
          method = "polychor"
          
        } else {
          
          result = polyserial(y, x, std.err=TRUE)
          p_val = pchisq(result$chisq, result$df, lower.tail = FALSE)
          method = "polyserial"
          
        }
        
      } else {
        
        result = polyserial(x, y, std.err=TRUE)
        p_val = pchisq(result$chisq, result$df, lower.tail = FALSE)
        method = "polyserial"
        
      }
      
      rho = result$rho
      
    }
    
    my_list = list(rho, p_val, method)
    names(my_list) = c('rho', 'p_val', 'method')
    
    my_list
    
  }
  
}

練習3答案(2)

my_func.correlation(dat[,"DBP"],dat[,"SBP"])
## $rho
##       cor 
## 0.8649519 
## 
## $p_val
## [1] 2.665107e-287
## 
## $method
## [1] "pearson"
my_func.correlation(dat[dat$Income == 2 & dat$Diabetes == 1,"DBP"], dat[dat$Income == 2 & dat$Diabetes == 1,"SBP"])
## $rho
##       rho 
## 0.7791538 
## 
## $p_val
## [1] 0.0002095772
## 
## $method
## [1] "spearman"
my_func.correlation(dat[,"Income"],dat[,"SBP"])
## $rho
## [1] 0.001014369
## 
## $p_val
## [1] 0.8523447
## 
## $method
## [1] "polyserial"
my_func.correlation(dat[,"Income"],dat[,"Disease"])
## $rho
## [1] 0.008209949
## 
## $p_val
##           [,1]
## [1,] 0.8935464
## 
## $method
## [1] "polychor"
my_func.correlation(dat[,"eGFR"],dat[,"Disease"])
## $rho
## [1] 0.1063003
## 
## $p_val
## [1] 0.1468378
## 
## $method
## [1] "polyserial"

練習4答案(1)

my_func.correlation_matrix = function (data) {
  
  correlation_matrix = matrix(1, nrow = ncol(data), ncol = ncol(data))
  rownames(correlation_matrix) = colnames(data)
  colnames(correlation_matrix) = colnames(data)
  
  for (i in 1:ncol(data)) {
    
    for (j in 1:ncol(data)) {
      
      if (j < i) {
        
        result_list = my_func.correlation(data[,i],data[,j])
        correlation_matrix[j,i] = result_list$rho
        correlation_matrix[i,j] = result_list$p_val
        
      }
      
    }
    
  }
  
  correlation_matrix
  
}

練習4答案(2)

my_func.correlation_matrix(data = dat)
##                    eGFR       Disease Survival.time        Death
## eGFR          1.0000000  1.063003e-01  2.217909e-02 -0.077650458
## Disease       0.1468378  1.000000e+00  5.550212e-02 -0.001989727
## Survival.time 0.5046980 1.961238e-208  1.000000e+00  0.043674513
## Death         0.2418072  9.720354e-01 6.345820e-221  1.000000000
## Diabetes      0.1589850  5.112662e-03 6.547588e-202  0.786730236
## Cancer        0.1153516  9.857535e-01 3.406996e-215  0.585043216
## SBP           0.0000000  7.060265e-01  4.046682e-01  0.234451469
## DBP           0.0000000  3.396927e-02  6.293485e-01  0.199337449
## Education     0.3037279  1.264179e-01 6.793488e-220  0.315127602
## Income        0.4530232  8.935464e-01 4.130601e-215  0.750710303
##                 Diabetes       Cancer            SBP         DBP
## eGFR          0.55061563  0.036718500   9.683834e-01  0.91920635
## Disease       0.18187368  0.001003631   1.147569e-01  0.07783083
## Survival.time 0.02964107 -0.006368456   2.759606e-02 -0.01588972
## Death         0.01621756 -0.028495907  -7.325610e-02 -0.06317791
## Diabetes      1.00000000  0.082807411   5.587485e-01  0.47481803
## Cancer        0.16045071  1.000000000   4.945587e-02  0.03467301
## SBP           0.63326768  0.528875761   1.000000e+00  0.86495187
## DBP           0.68352697  0.402321710  2.665107e-287  1.00000000
## Education     0.56456435  0.355532672   7.483356e-01  0.76053101
## Income        0.72928037  0.444110047   8.523447e-01  0.35923294
##                Education       Income
## eGFR          0.06375443  0.051815168
## Disease       0.07781775  0.008209949
## Survival.time 0.51042244  0.026751086
## Death         0.04756400  0.017904954
## Diabetes      0.03098767  0.022109389
## Cancer        0.04330285  0.042625790
## SBP           0.05110441  0.001014369
## DBP           0.02460452 -0.020832642
## Education     1.00000000 -0.021262920
## Income        0.67511363  1.000000000

小結

– Cox比例風險模式需要安裝額外的套件才能運算,請同學回家後找一下是用哪一個套件,熟讀範例並弄清楚該怎麼操作,下次上課我們會從同學的報告先開始。